############################################################################
#####################    libraries and palette   ###########################
############################################################################
library(amap)
library(VennDiagram)
# Fixed palette of 14 hex colours; plots pick entries from it by position.
cccol <- c(
  "#CE0013", "#16557A", "#C7A609", "#87C232", "#64C0AB", "#A14C94", "#15A08C",
  "#8B7E75", "#1E7CAF", "#EA425F", "#46489A", "#E50033", "#0F231F", "#1187CD"
)

############################################################################
#####################           read in data     ###########################
############################################################################
# Load the expression table; the first column of the file supplies row names.
data <- read.table("../data/nsmb.2660-S2.txt", header = TRUE, row.names = 1)

# Column indices of `data` that make up each group of samples.
Oocyte <- 1:3
Zygote <- 4:6
cell2 <- 7:12
cell4 <- 13:24
cell8 <- 25:44
Morula <- 45:60
MTE <- c(64, 66, 67, 69, 72, 76:79)
PTE <- c(61:63, 65, 68, 70, 71, 81, 82)
PE <- c(84:90)
EPI <- c(73:75, 80, 83)
hESC0 <- 91:98
hESC10 <- 99:124

# Column names for the averaged matrix (time_point) and a second label set
# without the leading "X" (dev_labels).
time_point <- c(
  "Oocyte", "Zygote", "X2cell", "X4cell", "X8cell", "Morula",
  "MTE", "PTE", "PE", "EPI", "hESC0", "hESC10"
)
dev_labels <- c(
  "Oocyte", "Zygote", "2cell", "4cell", "8cell", "Morula",
  "MTE", "PTE", "PE", "EPI", "hESC0", "hESC10"
)

# One column per group: the per-row mean over that group's columns.
# The list order must match the order of time_point.
stage_columns <- list(
  Oocyte, Zygote, cell2, cell4, cell8, Morula,
  MTE, PTE, PE, EPI, hESC0, hESC10
)
avg <- do.call(
  cbind,
  lapply(stage_columns, function(cols) apply(data[, cols], 1, mean))
)
colnames(avg) <- time_point
development_path <- time_point

# log2(x + 1) transform of the group averages.
dData <- log2(avg + 1)

############ 2nd naive RNAseq
# Load the table and keep the sample columns listed in n_path, in that order.
logfpkm2nd <- read.table(
  "../data/2nd.reprogramming.lg2.all.fpkm.txt",
  header = TRUE,
  row.names = 1
)
n_path <- c(
  "hiF_r1", "hiF_r2", "he0_r1", "he0_r2", "he2_r1", "he2_r2",
  "he6_r1", "he6_r2", "n8_r1", "n8_r2", "n8_r3", "n12_r1", "n12_r2",
  "n14_r1", "n14_r2", "n14_r3", "n20_r1", "n20_r2", "n20_r3",
  "n24p_r1", "n24p_r2", "n24m_r1", "n24m_r2", "niPS_r1", "niPS_r2"
)
nData_tmp <- logfpkm2nd[, n_path]
# Invert a log2(x + 1) transform.
# NOTE(review): presumably the file stores log2(FPKM + 1) values - confirm.
nfpkm2nd <- 2^nData_tmp - 1

n_time_point <- c(
  "hiF", "he0", "he2", "he6", "n8", "n12", "n14", "n20",
  "n24p", "n24m", "niPS"
)
n_label <- c(
  "hiF-T", "0d", "2d", "6d", "8d", "12d", "14d", "20d",
  "24d+dox", "24d-dox", "niPSC-T"
)

# Column positions (within nfpkm2nd, i.e. n_path order) averaged into each
# output column; the list order must match n_time_point.
replicate_columns <- list(
  1:2, 3:4, 5:6, 7:8, 9:11, 12:13, 14:16, 17:19, 20:21, 22:23, 24:25
)
nData2ndfpkm <- do.call(
  cbind,
  lapply(replicate_columns, function(cols) apply(nfpkm2nd[, cols], 1, mean))
)
colnames(nData2ndfpkm) <- n_time_point
rownames(nData2ndfpkm) <- rownames(nfpkm2nd)
nData <- log2(nData2ndfpkm + 1)

# Restrict nData to the identifiers in the first column of the DEG file that
# are also row names of nData.
n_deg <- read.table("../Fig2/Gfold/cutoff.0.58/naive.2nd.deg")[, 1]
n_deg <- intersect(n_deg, rownames(nData))
nData <- nData[n_deg, ]

############ 2nd primed RNAseq
# Load the table (first column supplies row names), then apply log2(x + 1).
pData2ndfpkm <- read.table(
  "../data/paper.primed.fpkm.txt",
  header = TRUE,
  row.names = 1
)
pData <- log2(pData2ndfpkm + 1)

# Time-point labels (not used in the code visible below this point).
common_time_point <- c(
  "hiF-T", "2d", "6d", "8d", "14d", "20d", "24d+dox", "24d-dox", "iPSC-T"
)

############################################################################
###########        specific 8c genes/ cluster 8c genes         #############
############################################################################
# Score every row of avg with SpecificGene(row, "X8cell", 1).
# NOTE(review): SpecificGene is not defined in the visible part of this file;
# presumably it is provided elsewhere before this line runs - confirm.
develop_8cell <- apply(avg, 1, SpecificGene, "X8cell", 1)

# Names of the 500 highest-scoring rows, restricted to identifiers that are
# also row names of both pData2ndfpkm and nData2ndfpkm.
develop_8cell_gene <- names(
  sort(develop_8cell, decreasing = TRUE)[seq_len(500)]
)
develop_8cell_gene <- Reduce(
  intersect,
  list(develop_8cell_gene, rownames(pData2ndfpkm), rownames(nData2ndfpkm))
)

# Identifiers from the first column of the cluster file, restricted the same
# way.
cluster_8cell_genes <- as.vector(
  read.table("../data/kmcluster_36_naive2nd.txt")[, 1]
)
cluster_8cell_genes <- Reduce(
  intersect,
  list(cluster_8cell_genes, rownames(pData2ndfpkm), rownames(nData2ndfpkm))
)

# Gene set tested for enrichment: cluster-file identifiers that are also rows
# of the (DEG-filtered) nData matrix.
eight_genes <- intersect(cluster_8cell_genes, rownames(nData))

############################################################################
##############                      plot                  ##################
############################################################################
# Cluster the rows of nData into k groups.
# amap supplies Kmeans() (capital K) used below; it is also attached at the
# top of the file, so this second library() call is redundant but harmless.
library(amap)
# Number of clusters requested from both clustering calls below.
k <- 14
# Fix the RNG seed before the clustering calls.
set.seed(4)

# NOTE(review): this stats::kmeans() result is never read - km is overwritten
# by Kmeans() below. Do not delete the call casually: kmeans() picks random
# starting centres, so it advances the RNG between set.seed(4) and Kmeans().
# Removing it would presumably change which numeric id each cluster receives
# and invalidate the hard-coded selected_cluster ids - confirm before cleanup.
km <- kmeans(nData,k)
# Rebuild nData from nData2ndfpkm and re-apply the n_deg row filter. These
# three lines repeat the earlier construction of nData (nData2ndfpkm[,]
# selects every row and column).
nData <- log2(nData2ndfpkm[,]+1)
n_deg <- intersect(n_deg,rownames(nData))
nData <- nData[n_deg,]

# Clustering actually used downstream: amap::Kmeans with method "correlation".
km <- Kmeans(nData,k,method = "correlation")

# Cluster ids (values of km$cluster) used to index the per-cluster p-values
# when plotting; the order here is the order of the bars.
# NOTE(review): these ids are only meaningful for the exact seed and call
# sequence above.
selected_cluster <- c(10,11,4,5,8,6,2)

# Hypergeometric over-representation test of eight_genes in every cluster of
# km$cluster, with the rows of nData as the universe.
#
# Results, one entry per cluster id 1..k:
#   number        - genes shared by eight_genes and the cluster
#   pval          - P(X >= number) under the hypergeometric distribution
#   ratio         - number / length(eight_genes)
#   cluster_ratio - number / cluster size
# Each iteration also prints the 2x2 contingency counts:
#   (in both, only in eight_genes, only in cluster, in neither).
total_number <- nrow(nData)
# Size of the tested gene set; it does not change across clusters.
a_number <- length(eight_genes)

# Preallocate instead of growing the vectors with c() on every iteration.
pval <- numeric(k)
ratio <- numeric(k)
cluster_ratio <- numeric(k)
number <- integer(k)

for (each_cluster in seq_len(k)) {
  GroupB <- names(which(km$cluster == each_cluster))
  common_number <- length(intersect(eight_genes, GroupB))
  b_number <- length(GroupB)

  number[each_cluster] <- common_number
  # Upper tail P(X > common_number - 1) = P(X >= common_number).
  # lower.tail = FALSE computes the tail directly; 1 - phyper(...) loses
  # precision by cancellation and can return exactly 0 for strong
  # enrichments, which would become Inf after -log10().
  pval[each_cluster] <- phyper(
    common_number - 1,
    a_number,
    total_number - a_number,
    b_number,
    lower.tail = FALSE
  )
  ratio[each_cluster] <- common_number / a_number
  cluster_ratio[each_cluster] <- common_number / b_number

  print(c(
    common_number,
    a_number - common_number,
    b_number - common_number,
    total_number - a_number - b_number + common_number
  ))
}

# Bar plot of -log10(p-value) for the selected clusters, written to SFig3A.pdf.
# Labels are given in the same order as selected_cluster.
selected_cluster_name <- c(
  "early embryogenesis",
  "pre-implantation",
  "early somatic",
  "late somatic",
  "late embryogenesis",
  "metabolic",
  "placenta development"
)

pdf("SFig3A.pdf", width = 3.5, height = 5)
# Large bottom margin leaves room for the vertical (las = 2) bar labels.
par(mar = c(12, 4, 2, 2))
barplot(
  -log10(pval[selected_cluster]),
  names.arg = selected_cluster_name,
  # First six bars black, the seventh in the first palette colour.
  col = c(rep("black", 6), cccol[1]),
  border = NA,
  ylab = "-log10(p-value)",
  las = 2
)
# Dashed reference line at -log10(p) = 2, i.e. p = 0.01.
abline(h = 2, lty = 2)
dev.off()

# pval[selected_cluster]
# [1] 9.959387e-01 7.489600e-01 6.335871e-01 4.072914e-01 3.517231e-01 6.041111e-01 3.784322e-08
